Note: run install.packages() before loading the packages.
# General data manipulation: summarise, filter, etc.
# install.packages("dplyr")
library(dplyr)
# NOTE(review): plyr is loaded *after* dplyr, so plyr's summarise()/mutate()
# mask dplyr's versions; the ddply(..., summarise, ...) calls below depend on
# plyr's summarise. Keep this load order.
#install.packages("plyr")
library(plyr)
# manipulation of date/time data
#install.packages("chron")
library(chron)
# interactive plots (D3 scatterplots)
#install.packages("scatterD3")
library(scatterD3)
# interactive bar charts
#install.packages("plotly")
library(plotly)
# HTTP fetching of the raw data files (getURL)
#install.packages("RCurl")
library(RCurl)
# Fetch the raw score feed: one comma-separated record per score update, no header.
lines <- readLines(textConnection(getURL("https://raw.githubusercontent.com/dustinvanstee/nba-rt-prediction/master/scores_nba.test.dat")))
head(lines)
## [1] "2016-04-05,15:06:16,Phoenix,0,Atlanta,0,(8:00 PM ET),48.0,400829044" "2016-04-05,15:06:16,Chicago,0,Memphis,0,(8:00 PM ET),48.0,400829045" "2016-04-05,15:06:16,Cleveland,0,Milwaukee,0,(8:00 PM ET),48.0,400829046" "2016-04-05,15:06:16,Oklahoma City,0,Denver,0,(9:00 PM ET),48.0,400829047" "2016-04-05,15:06:16,New Orleans,0,Philadelphia,0,(7:00 PM ET),48.0,400829041" "2016-04-05,15:06:16,Detroit,0,Miami,0,(8:00 PM ET),48.0,400829042"
# turn the .dat file to dataframe
# Expected record layout (9 fields): date, timestamp, team a, score a,
# team b, score b, game-clock string, minutes left, game id.
nba_scores_DF <- as.data.frame(do.call(rbind, strsplit(lines, ",")), stringsAsFactors=FALSE)
## Warning in (function (..., deparse.level = 1) : number of columns of result is not a multiple of vector length (arg 14628)
# The warning above indicates at least one raw record did not split into 9
# fields; those malformed rows produce NA values during coercion below and
# are dropped by na.omit afterwards.
# Since I don't have a header in the data set, I want to specify the column metadata
colnames(nba_scores_DF) <- c("dateOrig","ts","teamlonga", "scorea", "teamlongb", "scoreb", "timestring", "timeleft", "gameid")
# Coerce every column to its proper type; invalid numeric/date fields become NA here.
nba_scores_DF2 <- transform(nba_scores_DF,
dateOrig = as.Date(dateOrig),
ts = as.character(ts),
teamlonga = as.character(teamlonga),
scorea = as.numeric (scorea),
teamlongb = as.character(teamlongb),
scoreb = as.numeric (scoreb),
timestring = as.character(timestring),
timeleft = as.numeric(timeleft),
gameid = as.character(gameid))
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
This data is the raw input that contains a record for each update of the game. The data has some errors and redundancies that must be removed; we will discuss those as we go along. In particular, we need to separate the in-game scores from the final scores and re-merge them for our model.
# NAs are introduced because the raw data has invalid data points, so remove these observations
rtscoresAndFinalDF <- na.omit(nba_scores_DF2)
# This frame still mixes in-game updates and FINAL records; they are split apart later.
dim(rtscoresAndFinalDF) #16746 9
## [1] 16746 9
head(rtscoresAndFinalDF)
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 2016-04-05 15:06:16 Phoenix 0 Atlanta 0 (8:00 PM ET) 48 400829044
## 2 2016-04-05 15:06:16 Chicago 0 Memphis 0 (8:00 PM ET) 48 400829045
## 3 2016-04-05 15:06:16 Cleveland 0 Milwaukee 0 (8:00 PM ET) 48 400829046
## 4 2016-04-05 15:06:16 Oklahoma City 0 Denver 0 (9:00 PM ET) 48 400829047
## 5 2016-04-05 15:06:16 New Orleans 0 Philadelphia 0 (7:00 PM ET) 48 400829041
## 6 2016-04-05 15:06:16 Detroit 0 Miami 0 (8:00 PM ET) 48 400829042
# Peek at final-score records: timestring is "(FINAL)" and timeleft is 0.
head(filter(rtscoresAndFinalDF, grepl("FINAL", timestring)))
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 2016-04-05 21:22:09 New Orleans 93 Philadelphia 107 (FINAL) 0 400829041
## 2 2016-04-05 22:08:42 Charlotte 90 Toronto 96 (FINAL) 0 400829043
## 3 2016-04-05 22:25:25 Chicago 92 Memphis 108 (FINAL) 0 400829045
## 4 2016-04-05 22:28:58 Phoenix 90 Atlanta 103 (FINAL) 0 400829044
## 5 2016-04-05 22:30:29 Cleveland 109 Milwaukee 80 (FINAL) 0 400829046
## 6 2016-04-05 22:30:29 Detroit 89 Miami 107 (FINAL) 0 400829042
# Peek at in-game records: timestring carries the quarter/clock, timeleft the minutes remaining.
head(filter(rtscoresAndFinalDF, grepl("1ST", timestring)))
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 2016-04-05 19:23:42 New Orleans 23 Philadelphia 12 (4:39 IN 1ST) 40.65000 400829041
## 2 2016-04-05 19:23:57 New Orleans 23 Philadelphia 14 (4:05 IN 1ST) 40.08333 400829041
## 3 2016-04-05 19:24:13 New Orleans 23 Philadelphia 14 (3:41 IN 1ST) 39.68333 400829041
## 4 2016-04-05 19:24:28 New Orleans 23 Philadelphia 14 (3:32 IN 1ST) 39.53333 400829041
## 5 2016-04-05 19:24:43 New Orleans 23 Philadelphia 16 (3:24 IN 1ST) 39.40000 400829041
## 6 2016-04-05 19:25:29 New Orleans 23 Philadelphia 16 (3:11 IN 1ST) 39.18333 400829041
These were a couple of custom UDFs I needed to cleanse the data and also to add a few features based on a proprietary way of combining the score with the time left.
# Map full NBA team names to their 3-letter abbreviations.
# Note "Philadelphia" and "Phila." both map to "phi" (the odds feed
# abbreviates some names). Returns a factor; unknown names yield NA.
teamMap <- function(x) {
  long_names <- c("Atlanta", "Boston", "Brooklyn", "Charlotte", "Chicago",
                  "Cleveland", "Dallas", "Denver", "Detroit", "Golden State",
                  "Houston", "Indiana", "LA Clippers", "LA Lakers", "Memphis",
                  "Miami", "Milwaukee", "Minnesota", "New Orleans", "New York",
                  "Oklahoma City", "Orlando", "Philadelphia", "Phila.", "Phoenix",
                  "Portland", "Sacramento", "San Antonio", "Toronto", "Utah", "Washington")
  short_names <- as.factor(c("atl", "bos", "bkn", "cha", "chi",
                             "cle", "dal", "den", "det", "gst",
                             "hou", "ind", "lac", "lal", "mem",
                             "mia", "mil", "min", "nor", "nyk",
                             "okc", "orl", "phi", "phi", "pho",
                             "por", "sac", "san", "tor", "uta", "wsh"))
  # Positional lookup; indexing the factor preserves its level set.
  short_names[match(as.character(x), long_names)]
}
# Convert 3-letter English month abbreviations ("Jan".."Dec") to
# zero-padded 2-digit month strings. Returns a factor with levels
# "01".."12"; unmatched input yields NA.
monthMap <- function(x) {
  # base::month.abb is exactly c("Jan", "Feb", ..., "Dec")
  month_numbers <- as.factor(sprintf("%02d", seq_len(12)))
  month_numbers[match(as.character(x), month.abb)]
}
# Date Logic to adjust for games that finish on the day after ....
# This is due to not having a great key to join my tables ...
#
# Shift a game's date back one day when its timestamp falls between 00:00
# and 03:59 (score updates logged after midnight belong to the previous
# day's game).
#
# datein : Date vector; tsin : character "HH:MM:SS" timestamps.
# Returns a plain numeric vector of days-since-epoch — callers re-wrap it
# with as.Date(..., origin = "1970-01-01"), matching the original loop,
# which stripped the Date class by assigning into a bare c() vector.
#
# Fixes vs. the original: vectorized (the loop grew the result with
# newdate[i] <- ..., an O(n^2) pattern), and safe for zero-length input
# (1:length(tsin) ran the loop on indices 1 and 0 for empty vectors).
dateadjustudf <- function(datein, tsin){
  after_midnight <- grepl("^0[0-3]", tsin)   # TRUE => subtract one day
  as.numeric(unclass(datein) - after_midnight)
}
# UDFs to create some extra features ... this one is for an experiemental combination of Time left and Score difference.
# Made this via intuition. This can be extended to add other custom features
# val crossOverTime = 8
# val exponentScaler = 0.5
# There is no need to create UDFs here
Here I create some extra columns for later use.
# Remove Overtime games from this analysis
rtscoresAndFinalDF <- filter(rtscoresAndFinalDF, !grepl(".*OT.*", timestring))
#16626
# Create short 3 character team names
rtscoresAndFinalDF$teama <- teamMap(rtscoresAndFinalDF$teamlonga)
rtscoresAndFinalDF$teamb <- teamMap(rtscoresAndFinalDF$teamlongb)
# Add a score differential Column (team a minus team b)
rtscoresAndFinalDF$scorea_scoreb <- rtscoresAndFinalDF$scorea - rtscoresAndFinalDF$scoreb
# Transform the Date. This is for games that spanned multiple days and gave me a headache.
# Games adjusted to the day they started on.
rtscoresAndFinalDF$date <- dateadjustudf(rtscoresAndFinalDF$dateOrig, rtscoresAndFinalDF$ts)
# dateadjustudf returns days-since-epoch, so re-wrap it as a Date here.
rtscoresAndFinalDF$date <- as.Date(rtscoresAndFinalDF$date, origin = "1970-01-01")
# Create a Key for me to use to join with odds data later. Key = date.teama.teamb
# paste0() is vectorized over columns, so no per-row loop is needed (the
# original loop grew the key column one element at a time, an O(n^2) pattern).
# paste0 converts the Date and factor columns to character exactly as the
# element-wise loop did, so the resulting keys are identical.
rtscoresAndFinalDF$key <- paste0(rtscoresAndFinalDF$date, ".",
                                 rtscoresAndFinalDF$teama, ".",
                                 rtscoresAndFinalDF$teamb)
Currently, based on the way the data was sampled, both real-time scores and final scores are written as separate records to the same file. I need to pull these apart, and then join the dataframes so that each record has the real-time score and features along with whether the game was won or lost.
# Create Final Score DF
# Note a shortcut for repeating the dataframe within the filter is to use a $df.filter(df("foo").contains ... is equiv to df.filter($"foo".contains)
finalscoresDF <- filter(rtscoresAndFinalDF, grepl("FINAL", timestring))
# Rename some columns so that join later doesnt have name overlaps
# (fscorea/fscoreb = final scores, as opposed to in-game scorea/scoreb)
finalscoresDF$fscorea <- finalscoresDF$scorea
finalscoresDF$fscoreb <- finalscoresDF$scoreb
# Create final score difference (both orientations, used by later joins/plots)
finalscoresDF$fscorea_fscoreb <- finalscoresDF$fscorea - finalscoresDF$fscoreb
finalscoresDF$fscoreb_fscorea <- finalscoresDF$fscoreb - finalscoresDF$fscorea
# Add a Win/loss column Win = 1, Loss = 0.
# fscorea_fscoreb > 0 means team a (listed first) outscored team b, which
# per the sample output corresponds to away_win = 1 / home_win = 0.
# Vectorized comparison replaces the row-by-row loop that grew both columns
# one element at a time. Same numeric 0/1 coding, same column creation
# order (home_win then away_win), and a tie still counts as a home win,
# exactly as the original else-branch did.
finalscoresDF$home_win <- as.numeric(finalscoresDF$fscorea_fscoreb <= 0)
finalscoresDF$away_win <- as.numeric(finalscoresDF$fscorea_fscoreb > 0)
#################################################################################################################
# Create Real time score DF and more wrangling
# Remove Halftime records and these other cases as my datasource doesnt always change the quarter well
# as this particular case isn't handled well... (for now)
# (quarter-start / quarter-end records often carry a stale clock, so drop them all)
rtscoresDF <- filter(rtscoresAndFinalDF, !grepl('HALF', timestring), !grepl('FINAL', timestring),
timestring != "(12:00 IN 1ST)" ,
timestring != "(12:00 IN 2ND)" ,
timestring != "(12:00 IN 3RD)" ,
timestring != "(12:00 IN 4TH)" ,
timestring != "(END OF 1ST)" ,
timestring != "(END OF 2ND)" ,
timestring != "(END OF 3RD)" ,
timestring != "(END OF 4TH)" )
# Create real time score difference (both orientations)
rtscoresDF$scorea_scoreb <- rtscoresDF$scorea - rtscoresDF$scoreb
rtscoresDF$scoreb_scorea <- rtscoresDF$scoreb - rtscoresDF$scorea
# Create a game PCT complete and PCT left indicator
# (48 minutes of regulation; timeleft counts down from 48 to 0)
rtscoresDF$pct_complete <- (((rtscoresDF$timeleft * -1) + 48 )/48.0)*100
rtscoresDF$pct_left <- 100 - rtscoresDF$pct_complete
# Create a unique feature. Idea here is that I have intuition that timeleft and score difference are a strong predictor when combined
# cf1/cf2 amplify the score margin as pct_left shrinks; the +.01 guards
# against division by zero when no time is left. Exponents were hand-tuned.
rtscoresDF$cf1 <- (1/((rtscoresDF$pct_left/25 + .01)^.5)) * rtscoresDF$scoreb_scorea
rtscoresDF$cf2 <- (1/((rtscoresDF$pct_left/2.0 + .01)^1.3))*rtscoresDF$scoreb_scorea
After building my initial model, I noticed that the logistic model was not adjusting the probabilities well at the end of the games. I had some examples where I had 0 time left in the game, and yet the logistic model was giving a 70% chance of victory for a team. I speculated this was due to the fact that my original features were not fitting the end of game very well. To fix this, I created a "spreader" custom feature that basically takes the score difference and amplifies it as the game nears its end. This way the feature is very predictive at the end of games and can help adjust the probabilities to be more certain at the end of games.
# subset a dataframe for scatterplot
# (restricting to pct_complete < 95 — presumably to keep the late-game cf2
# blow-up from dominating the plot axes; confirm intent)
spreader <- filter(rtscoresDF, pct_complete < 95)
# draw interactive scatter plots: raw margin vs. game progress, then the
# cf2 amplified-margin feature vs. game progress
scatterD3(x = spreader$pct_complete, y = spreader$scoreb_scorea, col_var = spreader$key)
scatterD3(x = spreader$pct_complete, y = spreader$cf2, col_var = spreader$key)
***
## [1] "final scores data frame"
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea home_win away_win
## 1 2016-04-05 21:22:09 New Orleans 93 Philadelphia 107 (FINAL) 0 400829041 nor phi -14 2016-04-05 2016-04-05.nor.phi 93 107 -14 14 1 0
## 2 2016-04-05 22:08:42 Charlotte 90 Toronto 96 (FINAL) 0 400829043 cha tor -6 2016-04-05 2016-04-05.cha.tor 90 96 -6 6 1 0
## 3 2016-04-05 22:25:25 Chicago 92 Memphis 108 (FINAL) 0 400829045 chi mem -16 2016-04-05 2016-04-05.chi.mem 92 108 -16 16 1 0
## 4 2016-04-05 22:28:58 Phoenix 90 Atlanta 103 (FINAL) 0 400829044 pho atl -13 2016-04-05 2016-04-05.pho.atl 90 103 -13 13 1 0
## 5 2016-04-05 22:30:29 Cleveland 109 Milwaukee 80 (FINAL) 0 400829046 cle mil 29 2016-04-05 2016-04-05.cle.mil 109 80 29 -29 0 1
## 6 2016-04-05 22:30:29 Detroit 89 Miami 107 (FINAL) 0 400829042 det mia -18 2016-04-05 2016-04-05.det.mia 89 107 -18 18 1 0
## [1] "Total Games = 116"
## [1] "real time scores data frame"
## dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date key scoreb_scorea pct_complete pct_left cf1 cf2
## 1 2016-04-05 15:06:16 Phoenix 0 Atlanta 0 (8:00 PM ET) 48 400829044 pho atl 0 2016-04-05 2016-04-05.pho.atl 0 0 100 0 0
## 2 2016-04-05 15:06:16 Chicago 0 Memphis 0 (8:00 PM ET) 48 400829045 chi mem 0 2016-04-05 2016-04-05.chi.mem 0 0 100 0 0
## 3 2016-04-05 15:06:16 Cleveland 0 Milwaukee 0 (8:00 PM ET) 48 400829046 cle mil 0 2016-04-05 2016-04-05.cle.mil 0 0 100 0 0
## 4 2016-04-05 15:06:16 Oklahoma City 0 Denver 0 (9:00 PM ET) 48 400829047 okc den 0 2016-04-05 2016-04-05.okc.den 0 0 100 0 0
## 5 2016-04-05 15:06:16 New Orleans 0 Philadelphia 0 (7:00 PM ET) 48 400829041 nor phi 0 2016-04-05 2016-04-05.nor.phi 0 0 100 0 0
## 6 2016-04-05 15:06:16 Detroit 0 Miami 0 (8:00 PM ET) 48 400829042 det mia 0 2016-04-05 2016-04-05.det.mia 0 0 100 0 0
## [1] "Total Number of rt score records = 15947"
How to Read the Raw Odds data
Example Golden State -12.5 O (207.0) -125.0 | Detroit 12.5 U (207.0) 145.0
The away team is listed first, and the home team is second
Here Golden State is a 12.5 pt favorite to win. The over under is in parentheses (207) and is the 50/50 line between teams sum of scores
being above/below that line.
Finally, the -125 / +145 numbers are what's known as the moneyline odds.
A negative number means you need to bet $125 to get a $100 payout.
A positive number means you need to bet $100 to get a $145 payout.
# Fetch the raw odds XML; each relevant line holds one game's odds string.
xml <- readLines(textConnection(getURL("https://raw.githubusercontent.com/dustinvanstee/nba-rt-prediction/master/nbaodds_042516.xml")))
# use regular expression to catch info we need
# Strip the XML tags: keep the text between the first ">" and the character
# before the closing "</".
odds <- lapply(xml, function(x) substr(x, regexpr(">", x) + 1, regexpr("/", x) - 2))
odds_split <- lapply(odds, function(x) unlist(strsplit(x, " ")))
# get teamlonga
# First two whitespace tokens cover one- and two-word away team names.
teamlonga_0 <- lapply(odds_split, function(x) paste(x[1], x[2]))
# Trim trailing spread digits off one-word names (e.g. "Detroit 12.5" -> "Detroit").
# NOTE(review): inside a character class, "[0-9|-]" also matches a literal
# "|" — that looks unintended (alternation syntax doesn't apply in classes);
# it appears harmless here, but verify before reusing this pattern.
teamlonga <- lapply(teamlonga_0, function(x){
if (regexpr("[0-9|-]", x) > -1) {
substr(x, 1, regexpr("[0-9|-]", x)-2)
} else{
x
}
})
# get teamlongb
# Tokens 7-9 straddle the "|" separator and the home team name.
teamlongb_0 <- lapply(odds_split, function(x) paste(x[7],x[8], x[9]))
# Cut the home-team name out: from the first letter up to just before its
# spread digits.
teamlongb_1 <- lapply(teamlongb_0, function(x){
if (regexpr("[0-9]", x) > -1) {
substr(x, regexpr("[A-Za-z]", x), regexpr("[0-9-]", x)-2)
} else{
x
}
})
# NOTE(review): regexpr("|", x) treats "|" as regex alternation-with-empty,
# which matches at position 1 of every string, so this condition is always
# TRUE and the branch always runs. The branch itself still does something
# sensible (trim to the first letter), but the test is vacuous — confirm
# the original intent before tightening it.
teamlongb <- lapply(teamlongb_1, function(x){
if (regexpr("|", x) > -1){
substr(x, regexpr("[A-Za-z]", x), nchar(x))
} else {
x
}
})
# teamaspread
# The first digit-or-minus character starts the away spread; grab a 5-char window.
teamaspread_0 <- lapply(odds, function(x){
substr(x, regexpr("[0-9-]",x), regexpr("[0-9-]",x)+4)
})
# Trim at the first space in case the fixed 5-char window grabbed trailing text.
teamaspread <- lapply(teamaspread_0, function(x){
if (regexpr("[ ]", x) > 0){
substr(x, 1, regexpr("[ ]", x)-1)
} else {
x
}
})
# overunder
# The over/under total sits inside the first pair of parentheses.
overunder <- lapply(odds, function(x){
substr(x, regexpr("[(]", x) + 1, regexpr("[)]", x) - 1)
})
# teamaml
# Away moneyline: between the first ")" and the "|" separator.
teamaml <- lapply(odds, function(x){
substr(x,regexpr("[)]", x) + 2, regexpr("[|]", x) - 2 )
})
# teambml
# Home moneyline: between the second ")" and the third "(".
teambml <- lapply(odds, function(x){
substr(x, gregexpr("[)]", x)[[1]][2]+2, gregexpr("[(]", x)[[1]][3]-2)
})
#get date
# The game date follows the third "(" — the fixed offsets below assume a
# layout like "(Apr 25, 2016 ...)" (month abbrev, 2-digit day, 4-digit
# year); TODO confirm the offsets against the raw feed. The pieces are
# rebuilt as "YYYY-MM-DD" to match the scores-side date format.
dateStr <- lapply(odds, function(x){
month <- substr(x, gregexpr("[(]", x)[[1]][3]+1, gregexpr("[(]", x)[[1]][3]+3)
month_num <- monthMap(month)
date <- substr(x, gregexpr("[(]", x)[[1]][3]+5, gregexpr("[(]", x)[[1]][3]+6)
year <- substr(x, gregexpr("[(]", x)[[1]][3]+9, gregexpr("[(]", x)[[1]][3]+12)
paste0(year, "-", month_num, "-", date)
})
# get short team names
teama <- lapply(teamlonga, teamMap)
teamb <- lapply(teamlongb, teamMap)
# bind all column together into dataframe
# (rows where any field failed to parse/map become NA and are dropped by na.omit)
oddsDF <- na.omit(do.call(rbind, Map(data.frame, teamlonga=teamlonga, teama=teama, teamlongb=teamlongb, teamb=teamb, teamaspread=teamaspread, overunder=overunder, teamaml=teamaml, teambml=teambml, dateStr=dateStr)))
# change to right data type and create a key for join later
# (as.character first, so factor columns convert by label rather than level index)
oddsDF$teamaspread <- as.numeric(as.character(oddsDF$teamaspread))
oddsDF$overunder <- as.numeric(as.character(oddsDF$overunder))
oddsDF$teamaml <- as.numeric(as.character(oddsDF$teamaml))
oddsDF$teambml <- as.numeric(as.character(oddsDF$teambml))
oddsDF$teama <- as.character(oddsDF$teama)
oddsDF$teamb <- as.character(oddsDF$teamb)
# Key matches the scores-side key format: date.teama.teamb
oddsDF$key <- paste0(oddsDF$dateStr, ".", oddsDF$teama, ".", oddsDF$teamb)
dim(oddsDF) #161 10
## [1] 161 10
# add the groupby and average below because I was getting the game odds over multiple days, and it was adding noise to the analysis
oddsDF2 <- ddply(oddsDF, c("key", "teamlonga", "teamlongb", "teama", "teamb", "dateStr"), summarise,
teamaspread = mean(teamaspread),
overunder = mean(overunder),
teamaml = mean(teamaml),
teambml = mean(teambml))
# Create a few new columns for later analysis
# Vegas-implied final scores: split the over/under total by half the spread.
oddsDF2$teambspread <- oddsDF2$teamaspread * -1
oddsDF2$teama_vegas_fscore <- (oddsDF2$overunder / 2.0) - (oddsDF2$teamaspread / 2.0)
oddsDF2$teamb_vegas_fscore <- (oddsDF2$overunder / 2.0) + (oddsDF2$teamaspread / 2.0)
# Peek at the de-duplicated per-game odds frame.
head(oddsDF2)
## key teamlonga teamlongb teama teamb dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 2016-04-05.cha.tor Charlotte Toronto cha tor 2016-04-05 4.0 200.5 155 -175 -4.0 98.25 102.25
## 2 2016-04-05.chi.mem Chicago Memphis chi mem 2016-04-05 -3.0 201.5 -150 130 3.0 102.25 99.25
## 3 2016-04-05.cle.mil Cleveland Milwaukee cle mil 2016-04-05 -7.5 203.0 -340 280 7.5 105.25 97.75
## 4 2016-04-05.det.mia Detroit Miami det mia 2016-04-05 4.0 202.0 160 -190 -4.0 99.00 103.00
## 5 2016-04-05.lal.lac LA Lakers LA Clippers lal lac 2016-04-05 14.5 208.0 -110 -110 -14.5 96.75 111.25
## 6 2016-04-05.min.gst Minnesota Golden State min gst 2016-04-05 15.5 225.0 -110 -110 -15.5 104.75 120.25
# Sanity checks: all 30 teams should appear on each side of the odds data.
# NOTE(review): per the odds format described above, team a is the *away*
# side and team b the *home* side — these two labels look swapped; verify.
paste("total home teams = ", length(unique(oddsDF2$teama)))
## [1] "total home teams = 30"
paste("total away teams = ", length(unique(oddsDF2$teamb)))
## [1] "total away teams = 30"
paste("total games collected = ", nrow(oddsDF2))
## [1] "total games collected = 111"
Here we are averaging the away spread per team. If the bar is above the zero line, the team is an underdog; if it is under the line, the team is the favorite. 8 of the 30 teams were favorites on the road — and they are the likely suspects, including CLE/GST/OKC.
# visualize away spread data
# First average per matchup, then average those per away team.
avg_away_spread <- ddply(oddsDF2, c("teamlonga", "teamlongb"), summarise,
awayspread_avg_teamaspread = mean(teamaspread),
awayspread_avg_teambspread = mean(teambspread))
# away spread group by teama
away_spread_teama <- ddply(avg_away_spread, c("teamlonga"), summarise,
teamaspread = mean(awayspread_avg_teamaspread))
# order by teama (alphabetical, so the bar chart x-axis is sorted)
away_spread_teama$teamlonga <- as.character(away_spread_teama$teamlonga)
away_spread_teama <- away_spread_teama[order(away_spread_teama$teamlonga), ]
# barchart: bars above zero = road underdog, below zero = road favorite
p <- plot_ly(
x = away_spread_teama$teamlonga,
y = away_spread_teama$teamaspread,
type = "bar")
p
Here we are averaging the home spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. Note here that the home teams are favored much more, with the usual suspects having a very large advantage (SAN/GST/OKC)
# spread group by teamb (home-side view of the same matchup averages)
away_spread_teamb <- ddply(avg_away_spread, c("teamlongb"), summarise,
teambspread = mean(awayspread_avg_teambspread))
# order by teamb (alphabetical, so the bar chart x-axis is sorted)
away_spread_teamb$teamlongb <- as.character(away_spread_teamb$teamlongb)
away_spread_teamb <- away_spread_teamb[order(away_spread_teamb$teamlongb), ]
# barchart: bars above zero = home underdog, below zero = home favorite
p <- plot_ly(
x = away_spread_teamb$teamlongb,
y = away_spread_teamb$teambspread,
type = "bar")
p
# Here is where we join the Odds/Realtime scores/ Final Scores into one holistic data set as input for Logistic Machine Learning
# Create a smaller Final Score Dataframe. Just keep the key, final score a and b, the win/loss indicator
finalslicedscoresDF <- finalscoresDF[c("key","fscorea", "fscoreb", "fscorea_fscoreb", "fscoreb_fscorea", "away_win", "home_win")]
# First Join the 2 smallest data frames ... odd and final.
# merge() defaults to an inner join, so games missing from either side drop out here.
gameDF <- merge(finalslicedscoresDF, oddsDF2, by = "key")
# Drop the redundant team-name columns; the key already encodes date and teams.
gameDF$teamlonga <- NULL
gameDF$teamlongb <- NULL
gameDF$teama <- NULL
gameDF$teamb <- NULL
# Print Out the Game Dataframe ... notice we have the odds data merged with the win loss data ....
print("gameDF")
## [1] "gameDF"
# Peek at the joined odds + final-outcome frame (one row per game).
head(gameDF)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4.0 200.5 155 -175 -4.0 98.25 102.25
## 2 2016-04-05.chi.mem 92 108 -16 16 0 1 2016-04-05 -3.0 201.5 -150 130 3.0 102.25 99.25
## 3 2016-04-05.cle.mil 109 80 29 -29 1 0 2016-04-05 -7.5 203.0 -340 280 7.5 105.25 97.75
## 4 2016-04-05.det.mia 89 107 -18 18 0 1 2016-04-05 4.0 202.0 160 -190 -4.0 99.00 103.00
## 5 2016-04-05.lal.lac 81 103 -22 22 0 1 2016-04-05 14.5 208.0 -110 -110 -14.5 96.75 111.25
## 6 2016-04-05.nor.phi 93 107 -14 14 0 1 2016-04-05 2.5 207.0 125 -145 -2.5 102.25 104.75
# Count of games surviving the inner join of odds and final scores.
paste("total games collected:", nrow(gameDF)) #103
## [1] "total games collected: 103"
# Here we show that the better a team is (negative spread), the more they are likely to win ...
# Here the spread at the start of the game is a decent predictor regarding the end result
# Final Score Difference vs Spread
# Top Left indicates teams with a large pos spread will lose by a wider margin
# the line should approx pass through 0,0
# lower Right indicates teams with large neg spread will win by a wider margin
# The logistic and linear models we build will quantify this for us later!
scatterD3(x = gameDF$fscoreb_fscorea, y = gameDF$teamaspread)
# Here we can show another weak correlation of the vegas overunder/spread to the actual final outcome.
# vegas_fscore was calculated by taking overunder/2 +- the spread/2 to get a projection of
# the home/away teams score
# Here if the prediction and data were perfectly correlated, we would pass through the
# y=x line. in general we follow that path
# we will see how this term plays when we dig into the linear model
# here only home team is shown, but same trend holds for away team
scatterD3(x = gameDF$teamb_vegas_fscore, y = gameDF$fscoreb)
# This is the bigger merge. Merging the odds/final score data with the real time indicators ...
# Inner join on key: every real-time snapshot now carries its game's odds
# and final outcome, fanning gameDF out to one row per snapshot.
lrDF <- merge(gameDF, rtscoresDF, by = "key")
print("lrDF : Logistic Regression Data Frame")
## [1] "lrDF : Logistic Regression Data Frame"
# Peek at the fully merged modeling frame (one row per in-game snapshot).
head(lrDF)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date scoreb_scorea pct_complete pct_left cf1 cf2
## 1 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:32:49 Charlotte 42 Toronto 52 (1:12 IN 2ND) 25.20000 400829043 cha tor -10 2016-04-05 10 47.500000 52.50000 6.884284 0.14286204
## 2 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:56:38 Charlotte 44 Toronto 58 (8:56 IN 3RD) 20.93333 400829043 cha tor -14 2016-04-05 14 56.388889 43.61111 10.569592 0.25452584
## 3 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:08:41 Charlotte 19 Toronto 28 (11:08 IN 2ND) 35.13333 400829043 cha tor -9 2016-04-05 9 26.805556 73.19444 5.250891 0.08348447
## 4 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:26:10 Charlotte 35 Toronto 41 (4:11 IN 2ND) 28.18333 400829043 cha tor -6 2016-04-05 6 41.284722 58.71528 3.906817 0.07411763
## 5 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 20:32:34 Charlotte 40 Toronto 52 (1:30 IN 2ND) 25.50000 400829043 cha tor -12 2016-04-05 12 46.875000 53.12500 8.212631 0.16881814
## 6 2016-04-05.cha.tor 90 96 -6 6 0 1 2016-04-05 4 200.5 155 -175 -4 98.25 102.25 2016-04-05 19:49:35 Charlotte 9 Toronto 12 (7:18 IN 1ST) 43.30000 400829043 cha tor -3 2016-04-05 3 9.791667 90.20833 1.577128 0.02120872
# Count of in-game snapshots in the merged modeling frame.
paste("total data points collected:", nrow(lrDF)) #13412
## [1] "total data points collected: 13412"
# Add an overunder/spread adjusted projection as points are scored during the game
# I found this is a strong indicator
# Projected final = current score + Vegas-implied score scaled by the
# fraction of the game remaining.
lrDF$teama_adj_fscore <- ((lrDF$pct_complete * -1)/100 + 1) * lrDF$teama_vegas_fscore + lrDF$scorea
lrDF$teamb_adj_fscore <- ((lrDF$pct_complete * -1)/100 + 1) * lrDF$teamb_vegas_fscore + lrDF$scoreb
lrDF$pfscoreb_pfscorea <- lrDF$teamb_adj_fscore - lrDF$teama_adj_fscore
# There is an issue with the data I had captured. When a quarter transitions from 1st->2nd (etc,etc), sometime the timestring doesn't get updated properly. Since I used the timestring to calculate the timeleft in the game, I would get some rogue data points.
# Example, after 1 min in a game, something the two teams would have scores in the 20's, because it was really at 11 mins in the second quarter.
# My solution was to use the final score sum, and then just scale that down to the time left in the game. I would then compare to the sum of scores i had, and if it was significantly higher, I would remove them. I did this by visual inspection ...
# dfa = departure_from_avg
# dfa = (pace-implied combined score at this point in the game) - (actual combined score);
# large negative dfa means the recorded score is running far ahead of pace, i.e. a rogue point.
lrDF$dfa <- (lrDF$fscorea + lrDF$fscoreb)/48 * (lrDF$timeleft * -1 + 48) - (lrDF$scorea + lrDF$scoreb)
# Keep snapshots within 30 points of pace; the -30 threshold was chosen by
# visual inspection (see note above).
lrDF_filtered <- filter(lrDF, dfa > -30)
summary(lrDF_filtered)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore dateOrig ts teamlonga scorea teamlongb scoreb timestring timeleft gameid teama teamb scorea_scoreb date scoreb_scorea pct_complete pct_left cf1 cf2 teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea dfa
## Length:13217 Min. : 68.00 Min. : 80.0 Min. :-38.00 Min. :-29.00 Min. :0.000 Min. :0.000 2016-04-13:1829 Min. :-13.000 Min. :180.5 Min. :-553.33 Min. :-750.0 Min. :-19.000 Min. : 84.38 Min. : 84.92 Min. :2016-04-05 Length:13217 Length:13217 Min. : 0.00 Length:13217 Min. : 0.00 Length:13217 Min. : 0.00 Length:13217 cha : 796 bos : 889 Min. :-44.000 Min. :2016-04-05 Min. :-33.000 Min. : 0.00 Min. : 0.00 Min. :-290.000 Min. :-11545.108 Min. : 66.42 Min. : 75.50 Min. :-34.915 Min. :-26.7865
## Class :character 1st Qu.: 92.00 1st Qu.: 97.0 1st Qu.:-15.00 1st Qu.: -5.00 1st Qu.:0.000 1st Qu.:0.000 2016-04-05:1452 1st Qu.: -3.500 1st Qu.:200.3 1st Qu.:-150.00 1st Qu.:-230.0 1st Qu.: -9.500 1st Qu.: 97.17 1st Qu.:100.50 1st Qu.:2016-04-08 Class :character Class :character 1st Qu.: 28.00 Class :character 1st Qu.: 29.00 Class :character 1st Qu.:10.07 Class :character san : 716 mia : 804 1st Qu.:-11.000 1st Qu.:2016-04-08 1st Qu.: -3.000 1st Qu.: 28.65 1st Qu.: 20.97 1st Qu.: -2.362 1st Qu.: -0.044 1st Qu.: 93.50 1st Qu.: 97.55 1st Qu.: -3.841 1st Qu.: -3.7944
## Mode :character Median : 99.00 Median :105.0 Median : -8.00 Median : 8.00 Median :0.000 Median :1.000 2016-04-08:1448 Median : 4.833 Median :205.5 Median : 7.50 Median :-127.5 Median : -4.833 Median :100.25 Median :104.50 Median :2016-04-11 Mode :character Mode :character Median : 53.00 Mode :character Median : 56.00 Mode :character Median :22.73 Mode :character okc : 671 dal : 687 Median : -3.000 Median :2016-04-11 Median : 3.000 Median : 52.64 Median : 47.36 Median : 1.821 Median : 0.033 Median : 99.58 Median :104.41 Median : 5.383 Median : 0.6389
## Mean : 99.71 Mean :105.3 Mean : -5.59 Mean : 5.59 Mean :0.353 Mean :0.647 2016-04-11:1233 Mean : 3.518 Mean :204.8 Mean : 26.41 Mean :-122.4 Mean : -3.518 Mean :100.66 Mean :104.18 Mean :2016-04-11 Mean : 52.85 Mean : 56.14 Mean :22.40 cle : 639 tor : 639 Mean : -3.293 Mean :2016-04-11 Mean : 3.293 Mean : 53.32 Mean : 46.68 Mean : 4.799 Mean : 31.508 Mean : 99.85 Mean :104.79 Mean : 4.938 Mean : 0.4491
## 3rd Qu.:107.00 3rd Qu.:113.0 3rd Qu.: 5.00 3rd Qu.: 15.00 3rd Qu.:1.000 3rd Qu.:1.000 2016-04-10:1168 3rd Qu.: 9.500 3rd Qu.:209.5 3rd Qu.: 190.00 3rd Qu.: 115.0 3rd Qu.: 3.500 3rd Qu.:103.50 3rd Qu.:108.75 3rd Qu.:2016-04-14 3rd Qu.: 77.00 3rd Qu.: 82.00 3rd Qu.:34.25 tor : 578 hou : 593 3rd Qu.: 3.000 3rd Qu.:2016-04-13 3rd Qu.: 11.000 3rd Qu.: 79.03 3rd Qu.: 71.35 3rd Qu.: 9.017 3rd Qu.: 0.244 3rd Qu.:106.12 3rd Qu.:111.73 3rd Qu.: 14.159 3rd Qu.: 5.0549
## Max. :131.00 Max. :144.0 Max. : 29.00 Max. : 38.00 Max. :1.000 Max. :1.000 2016-04-06:1108 Max. : 19.000 Max. :225.2 Max. : 541.67 Max. : 410.0 Max. : 13.000 Max. :115.75 Max. :119.12 Max. :2016-04-24 Max. :131.00 Max. :144.00 Max. :48.00 mem : 577 ind : 586 Max. : 33.000 Max. :2016-04-24 Max. : 44.000 Max. :100.00 Max. :100.00 Max. : 380.000 Max. : 15128.072 Max. :135.35 Max. :146.36 Max. : 47.362 Max. : 21.3115
## (Other) :4979 (Other):9240 (Other):9019
# here we can see the trajectory of some of the games .....
# upper left beginning ... upper right (win), lower right (loss)
# cool visual .... gives an idea about how the games flow
# Sample only the Cleveland / Golden State games so the plot stays readable.
tsplot <- filter(lrDF_filtered, grepl("cle", key) | grepl("gst", key))
scatterD3(x = tsplot$pct_complete, y = tsplot$scoreb_scorea, col_var = tsplot$key)
# Data-quality check: count snapshots per game key and plot them, so games
# with suspiciously few (or many) records stand out.
DQ_check <- ddply(lrDF_filtered, c("key"), summarise,
                  N = length(key))
# order by N
DQ_check <- DQ_check[order(DQ_check$N),]
# plot
# BUG FIX: the column is "key" (lowercase) — the original DQ_check$Key
# returned NULL, leaving the bar chart with no x values.
p <- plot_ly(
  x = DQ_check$key,
  y = DQ_check$N,
  type = "bar")
p
# Wanted to save out the dataset at this point as I will branch into separate work efforts for a Logistic/Linear model building
# drop some columns as we move on to next step !!
# Keep only model-ready columns: identifiers, raw strings, the moneyline
# fields, and the dfa quality-filter helper are all dropped.
lrDF_final <- lrDF_filtered
lrDF_final$dateOrig <- NULL
lrDF_final$ts <- NULL
lrDF_final$teamlonga <- NULL
lrDF_final$teamlongb <- NULL
lrDF_final$timestring <- NULL
lrDF_final$gameid <- NULL
lrDF_final$teamaml <- NULL
lrDF_final$teambml <- NULL
lrDF_final$dfa <- NULL
lrDF_final$teama <- NULL
lrDF_final$teamb <- NULL
lrDF_final$dateStr <- NULL
names(lrDF_final)
## [1] "key" "fscorea" "fscoreb" "fscorea_fscoreb" "fscoreb_fscorea" "away_win" "home_win" "teamaspread" "overunder" "teambspread" "teama_vegas_fscore" "teamb_vegas_fscore" "scorea" "scoreb" "timeleft" "scorea_scoreb" "date" "scoreb_scorea" "pct_complete" "pct_left" "cf1" "cf2" "teama_adj_fscore" "teamb_adj_fscore" "pfscoreb_pfscorea"
# Final look at the trimmed modeling dataset before saving it out.
head(lrDF_final)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win teamaspread overunder teambspread teama_vegas_fscore teamb_vegas_fscore scorea scoreb timeleft scorea_scoreb date scoreb_scorea pct_complete pct_left cf1 cf2 teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea
## 1 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 42 52 25.20000 -10 2016-04-05 10 47.500000 52.50000 6.884284 0.14286204 93.58125 105.6813 12.100000
## 2 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 44 58 20.93333 -14 2016-04-05 14 56.388889 43.61111 10.569592 0.25452584 86.84792 102.5924 15.744444
## 3 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 19 28 35.13333 -9 2016-04-05 9 26.805556 73.19444 5.250891 0.08348447 90.91354 102.8413 11.927778
## 4 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 35 41 28.18333 -6 2016-04-05 6 41.284722 58.71528 3.906817 0.07411763 92.68776 101.0364 8.348611
## 5 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 40 52 25.50000 -12 2016-04-05 12 46.875000 53.12500 8.212631 0.16881814 92.19531 106.3203 14.125000
## 6 2016-04-05.cha.tor 90 96 -6 6 0 1 4 200.5 -4 98.25 102.25 9 12 43.30000 -3 2016-04-05 3 9.791667 90.20833 1.577128 0.02120872 97.62969 104.2380 6.608333
# Persist the wrangled dataset for the downstream model-building scripts.
write.csv(lrDF_final, file = "nba-datawrangle-lrDF.csv")